htmltools::includeHTML("./index.html")
About

Summary

Analyses and results of single-cell RNA sequencing data from CD14-enriched peripheral blood cells (primarily monocytes) from Parkinson’s Disease patients and age-matched controls.



shiny::h3("Results Directory: ", params$resultsPath)

Results Directory: ./Results

htmltools::includeHTML("./scRNAseq_Preprocessing.html")
Preprocessing

Setup

# Import functions
root <- "./"  
source(file.path(root,"general_functions.R"))
# import_parameters(params)

./Results

Load Libraries

library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr) 
library(plotly)
library(ggplot2)
library(shiny) 
library(DT) 

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.5.1      shiny_1.2.0   plotly_4.8.0  knitr_1.21    gridExtra_2.3
##  [6] dplyr_0.7.8   Seurat_2.3.4  Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.0
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15          colorspace_1.4-0    class_7.3-15       
##   [4] modeltools_0.2-22   ggridges_0.5.1      mclust_5.4.2       
##   [7] htmlTable_1.13.1    base64enc_0.1-3     rstudioapi_0.9.0   
##  [10] proxy_0.4-22        npsurv_0.4-0        flexmix_2.3-14     
##  [13] bit64_0.9-7         mvtnorm_1.0-8       codetools_0.2-16   
##  [16] splines_3.5.1       R.methodsS3_1.7.1   lsei_1.2-0         
##  [19] robustbase_0.93-3   jsonlite_1.6        Formula_1.2-3      
##  [22] ica_1.0-2           cluster_2.0.7-1     kernlab_0.9-27     
##  [25] png_0.1-7           R.oo_1.22.0         compiler_3.5.1     
##  [28] httr_1.4.0          backports_1.1.3     assertthat_0.2.0   
##  [31] lazyeval_0.2.1      later_0.8.0         lars_1.2           
##  [34] acepack_1.4.1       htmltools_0.3.6     tools_3.5.1        
##  [37] bindrcpp_0.2.2      igraph_1.2.2        gtable_0.2.0       
##  [40] glue_1.3.0          reshape2_1.4.3      RANN_2.6.1         
##  [43] Rcpp_1.0.0          trimcluster_0.1-2.1 gdata_2.18.0       
##  [46] ape_5.2             nlme_3.1-137        iterators_1.0.10   
##  [49] fpc_2.1-11.1        gbRd_0.4-11         lmtest_0.9-36      
##  [52] xfun_0.4            stringr_1.4.0       mime_0.6           
##  [55] irlba_2.3.3         gtools_3.8.1        DEoptimR_1.0-8     
##  [58] MASS_7.3-51.1       zoo_1.8-4           scales_1.0.0       
##  [61] promises_1.0.1      doSNOW_1.0.16       parallel_3.5.1     
##  [64] RColorBrewer_1.1-2  yaml_2.2.0          reticulate_1.10    
##  [67] pbapply_1.4-0       rpart_4.1-13        segmented_0.5-3.0  
##  [70] latticeExtra_0.6-28 stringi_1.2.4       foreach_1.4.4      
##  [73] checkmate_1.9.1     caTools_1.17.1.1    bibtex_0.4.2       
##  [76] Rdpack_0.10-1       SDMTools_1.1-221    rlang_0.3.1        
##  [79] pkgconfig_2.0.2     dtw_1.20-1          prabclus_2.2-7     
##  [82] bitops_1.0-6        evaluate_0.12       lattice_0.20-38    
##  [85] ROCR_1.0-7          purrr_0.3.0         bindr_0.1.1        
##  [88] htmlwidgets_1.3     bit_1.1-14          tidyselect_0.2.5   
##  [91] plyr_1.8.4          magrittr_1.5        R6_2.3.0           
##  [94] snow_0.4-3          gplots_3.0.1.1      Hmisc_4.2-0        
##  [97] pillar_1.3.1        foreign_0.8-71      withr_2.1.2        
## [100] fitdistrplus_1.0-14 mixtools_1.1.0      survival_2.43-3    
## [103] nnet_7.3-12         tsne_0.1-3          tibble_2.0.1       
## [106] crayon_1.3.4        hdf5r_1.0.1         KernSmooth_2.23-15 
## [109] rmarkdown_1.11      grid_3.5.1          data.table_1.12.0  
## [112] metap_1.1           digest_0.6.18       diptest_0.75-7     
## [115] xtable_1.8-3        httpuv_1.4.5.1      tidyr_0.8.2        
## [118] R.utils_2.7.0       stats4_3.5.1        munsell_0.5.0      
## [121] viridisLite_0.3.0
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Load Data

## ! IMPORTANT! Must not setwd to local path when launching on cluster
# setwd("~/Desktop/PD_scRNAseq/") 
load(file.path(root, "Data/seurat_object_add_HTO_ids.Rdata"))
DAT <- seurat.obj  
rm(seurat.obj)

Pre-filtered Dimensions

DAT
## An object of class seurat in project RAJ_13357 
##  24914 genes across 22113 samples.

Clean Metadata

Add Metadata

metadata <- read.table(file.path(root,"Data/meta.data4.tsv"))
createDT( metadata, caption = "Metadata")  
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Make AgeGroups
makeAgeGroups <- function(){
  dim(metadata)
  getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
  getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit)) 
   
  ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
  AgeGroupsUniq <- c()
  for (i in 1:(length(ageBreaks)-1)){ 
    AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-")) 
  } 
  data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age, 
                                  breaks = ageBreaks, 
                                  right = F, 
                                  labels = AgeGroupsUniq,
                                  nclude.lowest=T)]
  metadata <- data.frame(metadata)
  unique(metadata$AgeGroups)
  head(metadata)
  dim(metadata)
  return(metadata)
}
# metadata <- makeAgeGroups()

DAT <- AddMetaData(object = DAT, metadata = metadata)  
# Get rid of any NAs (cells that don't match up with the metadata) 
if(params$subsetCells==F){
  DAT <- FilterCells(object = DAT,  subset.names = "nGene", low.thresholds = 0)
} else {DAT <- FilterCells(object = DAT,  subset.names = "nGene", low.thresholds = 0,
                    # Subset for testing 
                    cells.use = DAT@cell.names[0:params$subsetCells]
                    )
}  

Filter & Normalize Data

Subset Genes by Biotype

Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html

subsetBiotypes <- function(DAT, subsetGenes){
  if( subsetGenes!=F ){
    cat(paste("Subsetting genes:",subsetGenes, "\n"))
    # If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
    if(file_test("-f", file.path(root, "Data/gene_biotypes.csv"))){
      biotypes <- read.csv(file.path(root, "Data/gene_biotypes.csv"))
    }
    else {
      ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
                       dataset="hsapiens_gene_ensembl") 
      ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
      listFilters(ensembl)
      listAttributes(ensembl)   
      biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
            values=row.names(DAT@data), mart=ensembl) 
      write.csv(biotypes, file.path(root,"Data/gene_biotypes.csv"), quote=F, row.names=F)
    } 
    # Subset data by creating new Seurat object (annoying but necessary)
    geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"] 
    
    cat(paste(dim(DAT@raw.data[geneSubset, ])[1],"/", dim(DAT@raw.data)[1], 
                "genes are", subsetGenes))
    # Add back into DAT 
    subset.matrix <- DAT@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
    DAT_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
    orig.ident <- row.names(DAT@meta.data) # Pull the identities from the original Seurat object as a data.frame
    DAT_sub <- AddMetaData(object = DAT_sub, metadata = DAT@meta.data) # Add the idents to the meta.data slot
    DAT_sub <- SetAllIdent(object = DAT_sub, id = "ident") # Assign identities for the new Seurat object
    DAT <- DAT_sub
    rm(list = c("DAT_sub","geneSubset", "subset.matrix", "orig.ident")) 
  } 
  return(DAT)
}

DAT <- subsetBiotypes(DAT, params$subsetGenes)
## Subsetting genes: protein_coding 
## 14827 / 24914 genes are protein_coding

Subset Cells

Filter by cells, normalize , filter by gene variability.

cat("Total Cells:", length(DAT@cell.names), "\n")
## Total Cells: 27863
DAT <- FilterCells(object = DAT, subset.names = c("nGene", "percent.mito"), 
    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
cat("Filtered Cells:", length(DAT@cell.names))
## Filtered Cells: 495
DAT <- NormalizeData(object = DAT, normalization.method = "LogNormalize", 
    scale.factor = 10000)

Subset Genes by Variance

Important!: * In ScaleData… + Specify do.par = F (unless you have parallel processing set up properly, this will cause your script to crash) + Specify num.cores = nCores (to use all available cores, determined by parallel::detectCores())

Regress out: number of unique transcripts (nUMI), % mitochondrial transcripts (percent.mito)

# Store the top most variable genes in @var.genes
DAT <- FindVariableGenes(object = DAT, mean.function = ExpMean, dispersion.function = LogVMR,
    x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

cat("Total Genes:", length(row.names(DAT@raw.data)), "\n")
## Total Genes: 14827
cat("Highly Variable Genes:", length(DAT@var.genes))
## Highly Variable Genes: 4560
# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
# IMPORTANT!: Use only the var.genes identified by 'FindVariableGenes' as the 'gene.use' arg in 'ScaleData'
## This will greatly reduced the computational load.

# Ensure CD14 and CD16 are included
appendedGenes <- c(DAT@var.genes, "CD14", "FCGR3A")
DAT <- ScaleData(object = DAT, genes.use = appendedGenes , vars.to.regress = c("nUMI", "percent.mito"), 
                  do.par = T, num.cores = params$nCores)
## Regressing out: nUMI, percent.mito
## 
## Time Elapsed:  4.37511205673218 secs
## Scaling data matrix

Filtered Dimensions

DAT
## An object of class seurat in project SeuratProject 
##  14827 genes across 495 samples.

Diagnostic Plots

Violin Plots

vp <- VlnPlot(object = DAT, features.plot = c("nGene", "nUMI", "percent.mito"),nCol = 3, do.return = T) %>% + ggplot2::aes(alpha=0.5)
vp

Gene Plots

percent.mito plot

# results = 'hide', fig.show='hide'
# par(mfrow = c(1, 2))
# do.hover <- ifelse(interactive==T, T, F)
GenePlot(object = DAT, gene1 = "nUMI", gene2 = "percent.mito", pch.use=20) 

         #do.hover=do.hover, data.hover = "mut")

nGene plot

# , results = 'hide', fig.show='hide'
# do.hover <-ifelse(interactive==T, T, F)
GenePlot(object = DAT, gene1 = "nUMI", gene2 = "nGene", pch.use=20)

         #do.hover=do.hover, data.hover = "mut")

Dimensionality Reduction & Clustering

PCA

ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above

  • Other Dim Reduction Methods in Seurat
  • RunCCA()
  • RunMultiCCA()
  • RunDiffusion()
  • RunPHATE()
  • RunICA()
# Run PCA with only the top most variables genes
DAT <- RunPCA(object = DAT, pc.genes = DAT@var.genes, do.print=F, verbose=F) #, pcs.print = 1:5,  genes.print = 5
# Store in Seurat object so you don't have to recalculate it for the tSNE/UMAP steps
DAT <- ProjectPCA(object = DAT, do.print=F) 

VizPCA

VizPCA(object = DAT, pcs.use = 1:2)

PCHeatmaps

# 'PCHeatmap' is a wrapper for heatmap.2  
PCHeatmap(object = DAT, pc.use = 1:12,do.balanced=T, label.columns=F, use.full=F)   # cells.use = 500, 

Significant PCs

Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time

#DAT <- JackStraw(object = DAT, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = DAT)

Find Cell Clusters

  • We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.
  • The clustering approach in FindClusters was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data SNN-Cliq, Xu and Su, Bioinformatics, 2015 and CyTOF data PhenoGraph, Levine et al., Cell, 2015.
  • To further increase speed, you can employ an approximate nearest neighbor search via the RANN package by increasing the nn.eps + parameter. Setting this at 0 (the default) represents an exact neighbor search.
  • By default, we perform 100 random starts for clustering and select the result with highest modularity. You can lower this through the n.start parameter to reduce clustering time.
  • Algorithm for modularity optimization (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM algorithm).

  • On Resolution
  • The FindClusters function implements the procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.6-1.2 typically returns good results for single cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters are saved in the object@ident slot.

DAT <- RunTSNE(object=DAT,  reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
                perplexity = params$perplexity, tsne.method = "Rtsne", num_threads=params$nCores, verbose=F) #   FItSNE 

# TRY DIFFERENT RESOLUTIONS
DAT <- StashIdent(object = DAT, save.name = "pre_clustering") 
# DAT <- SetAllIdent(object = DAT, id = "pre_clustering") 

DAT <- FindClusters(object = DAT, reduction.type = "pca", dims.use = 1:10, algorithm = 1,
                     resolution = params$resolution, print.output = F, save.SNN = T,
                     n.start = 10, nn.eps = 0.5, plot.SNN = T, force.recalc=T) 

PrintFindClustersParams(object = DAT) 
## Parameters used in latest FindClusters calculation run on: 2019-02-13 17:27:26
## =============================================================================
## Resolution: 0.6
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 10              10
## -----------------------------------------------------------------------------
## Reduction used          k.param          prune.SNN
##      pca                 30                0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
DAT <- StashIdent(object = DAT, save.name = "post_clustering") 

PCA plot

# do.hover <-ifelse(interactive==T, T, F)
PCAPlot(object = DAT, dim.1 = 1, dim.2 = 2, group.by="post_clustering")#, do.hover=do.hover, data.hover="mut")

UMAP

Additional UMAP arguments detailed here: https://umap-learn.readthedocs.io/en/latest/api.html#module-umap.umap_

# cat(print(mem_used()))
DAT <- RunUMAP(object = DAT, reduction.use = "pca", dims.use = 1:10, verbose=TRUE, num_threads=params$nCores) # , num_threads=0
# cat(print(mem_used()))
# Plot results
DimPlot(object = DAT, reduction.use = 'umap')

t-SNE

As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.

Important!: Specify num_threads=0 in ‘RunTSNE’ to use all available cores.

“FItSNE”, a new fast implementation of t-SNE, is also available through RunTSNE. However FItSNE must first be setup on your computer.

labSize <- 12 

customColors <- function(var="post_clustering", palette="Set1"){
  add.alpha <- function(col, alpha=1){ 
    if(missing(col))
      stop("Please provide a vector of colours.")
     apply(sapply(col, col2rgb)/255, 2, 
                       function(x) 
                         rgb(x[1], x[2], x[3], alpha=alpha))  
  } 
  cluster_colors  <- RColorBrewer::brewer.pal( length(unique(DAT@meta.data[var])), palette)
  cluster_colors_transparent <- add.alpha(cluster_colors, .5) %>% as.character()
  return(cluster_colors_transparent)
}

# Try t-SNE at different perplexities
for (i in c(params$perplexity,5,20,30,100)){
   cat('\n')   
   cat("### t-SNE: perplexity =",i,"\n") 
   DAT <- RunTSNE(object=DAT,  reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
                perplexity = i, tsne.method = "Rtsne", num_threads=params$nCores, verbose=F) #   FItSNE
  tsnePlot <- TSNEPlot(object = DAT, do.label=T, label.size = labSize, do.return=T) + 
    scale_color_brewer( palette = "Set1", aesthetics = aes(alpha=.5))
  print(tsnePlot)
  cat('\n')   
} 

t-SNE: perplexity = 30

t-SNE: perplexity = 5

t-SNE: perplexity = 20

t-SNE: perplexity = 30

t-SNE: perplexity = 100

t-SNE & Metadata Plots

tSNE_metadata_plot <- function(var, labSize=12){ 
  # Metadata plot  
  p1 <- TSNEPlot(DAT, do.return = T,  do.label = T,  group.by = var,label.size = labSize,
                 plot.title=paste(var), vector.friendly=T) +
    theme(legend.position = "top") + 
    scale_color_brewer( palette = "Dark2",  aesthetics = aes(alpha=.5)) 
     
  # t-SNE clusters plot
  p2 <- TSNEPlot(DAT, do.return = T, do.label = T,label.size = labSize,
                 plot.title=paste("Unsupervised Clusters"), vector.friendly=T)  +
    theme(legend.position = "top") + 
    scale_color_brewer( palette = "Set1", aesthetics = aes(alpha=.5))  
  print(plot_grid(p1,p2))
     
}   
# Iterate plots over metadata variables
metaVars <- c("dx","mut","Gender","Age", "ID")
for (var in metaVars){
  cat('\n')
  cat("### t-SNE Metadata plot for ",var, "\n") 
  tSNE_metadata_plot(var)
  cat('\n') 
} 

t-SNE Metadata plot for dx

t-SNE Metadata plot for mut

t-SNE Metadata plot for Gender

t-SNE Metadata plot for Age

t-SNE Metadata plot for ID

Save Results

save.image(file.path("scRNAseq_results.RData"))
htmltools::includeHTML("./scRNAseq_CharacterizeClusters.html")
Characterize Clusters

Setup

# Import functions
root = "./"
source(file.path(root,"general_functions.R"))
import_parameters(params)
## **** __Utilized Cores__ **** = 2$subsetGenes
## [1] "protein_coding"
## 
## $subsetCells
## [1] 500
## 
## $resolution
## [1] 0.6
## 
## $resultsPath
## [1] "./Results"
## 
## $nCores
## [1] 2
## 
## $perplexity
## [1] 30
load(file.path("scRNAseq_results.RData"))

Load Libraries

library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr) 
library(plotly)
library(ggplot2)
library(viridis)
library(reshape2)
library(shiny) 
library(ggrepel)
library(DT) 
library(ComplexHeatmap); #BiocManager::install("ComplexHeatmap") 
  
## Install Bioconductor
#  if (!requireNamespace("BiocManager"))
#     install.packages("BiocManager") 
library(biomaRt) # BiocManager::install(c("biomaRt"))
# library(DESeq2) # BiocManager::install(c("DESeq2"))
library(enrichR) #BiocManager::install("enrichR")

library(monocle) #BiocManager::install("monocle")
# BiocManager::install("DelayedMatrixStats")
# BiocManager::install("org.Mm.eg.db") 
library(org.Hs.eg.db)
library(garnett) # devtools::install_github("cole-trapnell-lab/garnett") 
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] splines   stats4    parallel  grid      stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] garnett_0.1.1         org.Hs.eg.db_3.7.0    AnnotationDbi_1.44.0 
##  [4] IRanges_2.16.0        S4Vectors_0.20.1      monocle_2.10.1       
##  [7] DDRTree_0.1.5         irlba_2.3.3           VGAM_1.0-6           
## [10] Biobase_2.42.0        BiocGenerics_0.28.0   enrichR_1.0          
## [13] biomaRt_2.38.0        ComplexHeatmap_1.20.0 DT_0.5.1             
## [16] ggrepel_0.8.0         shiny_1.2.0           reshape2_1.4.3       
## [19] viridis_0.5.1         viridisLite_0.3.0     plotly_4.8.0         
## [22] knitr_1.21            gridExtra_2.3         dplyr_0.7.8          
## [25] Seurat_2.3.4          Matrix_1.2-15         cowplot_0.9.4        
## [28] ggplot2_3.1.0        
## 
## loaded via a namespace (and not attached):
##   [1] snow_0.4-3           backports_1.1.3      circlize_0.4.5      
##   [4] Hmisc_4.2-0          plyr_1.8.4           igraph_1.2.2        
##   [7] lazyeval_0.2.1       densityClust_0.3     fastICA_1.2-1       
##  [10] digest_0.6.18        foreach_1.4.4        htmltools_0.3.6     
##  [13] lars_1.2             gdata_2.18.0         magrittr_1.5        
##  [16] checkmate_1.9.1      memoise_1.1.0        cluster_2.0.7-1     
##  [19] mixtools_1.1.0       ROCR_1.0-7           limma_3.38.3        
##  [22] matrixStats_0.54.0   docopt_0.6.1         R.utils_2.7.0       
##  [25] prettyunits_1.0.2    colorspace_1.4-0     blob_1.1.1          
##  [28] xfun_0.4             sparsesvd_0.1-4      crayon_1.3.4        
##  [31] RCurl_1.95-4.11      jsonlite_1.6         bindr_0.1.1         
##  [34] survival_2.43-3      zoo_1.8-4            iterators_1.0.10    
##  [37] ape_5.2              glue_1.3.0           gtable_0.2.0        
##  [40] GetoptLong_0.1.7     kernlab_0.9-27       shape_1.4.4         
##  [43] prabclus_2.2-7       DEoptimR_1.0-8       scales_1.0.0        
##  [46] pheatmap_1.0.12      mvtnorm_1.0-8        DBI_1.0.0           
##  [49] bibtex_0.4.2         Rcpp_1.0.0           metap_1.1           
##  [52] dtw_1.20-1           progress_1.2.0       xtable_1.8-3        
##  [55] htmlTable_1.13.1     reticulate_1.10      foreign_0.8-71      
##  [58] bit_1.1-14           proxy_0.4-22         mclust_5.4.2        
##  [61] SDMTools_1.1-221     Formula_1.2-3        tsne_0.1-3          
##  [64] htmlwidgets_1.3      httr_1.4.0           FNN_1.1.2.2         
##  [67] gplots_3.0.1.1       RColorBrewer_1.1-2   fpc_2.1-11.1        
##  [70] acepack_1.4.1        modeltools_0.2-22    ica_1.0-2           
##  [73] pkgconfig_2.0.2      XML_3.98-1.17        R.methodsS3_1.7.1   
##  [76] flexmix_2.3-14       nnet_7.3-12          tidyselect_0.2.5    
##  [79] rlang_0.3.1          later_0.8.0          munsell_0.5.0       
##  [82] tools_3.5.1          RSQLite_2.1.1        ggridges_0.5.1      
##  [85] evaluate_0.12        stringr_1.4.0        yaml_2.2.0          
##  [88] npsurv_0.4-0         bit64_0.9-7          fitdistrplus_1.0-14 
##  [91] robustbase_0.93-3    caTools_1.17.1.1     purrr_0.3.0         
##  [94] RANN_2.6.1           bindrcpp_0.2.2       pbapply_1.4-0       
##  [97] nlme_3.1-137         mime_0.6             slam_0.1-44         
## [100] R.oo_1.22.0          hdf5r_1.0.1          compiler_3.5.1      
## [103] rstudioapi_0.9.0     png_0.1-7            lsei_1.2-0          
## [106] tibble_2.0.1         stringi_1.2.4        lattice_0.20-38     
## [109] trimcluster_0.1-2.1  HSMMSingleCell_1.2.0 pillar_1.3.1        
## [112] combinat_0.0-8       Rdpack_0.10-1        lmtest_0.9-36       
## [115] GlobalOptions_0.1.0  data.table_1.12.0    bitops_1.0-6        
## [118] gbRd_0.4-11          httpuv_1.4.5.1       R6_2.3.0            
## [121] latticeExtra_0.6-28  promises_1.0.1       KernSmooth_2.23-15  
## [124] codetools_0.2-16     MASS_7.3-51.1        gtools_3.8.1        
## [127] assertthat_0.2.0     rjson_0.2.20         withr_2.1.2         
## [130] qlcMatrix_0.9.7      hms_0.4.2            diptest_0.75-7      
## [133] doSNOW_1.0.16        rpart_4.1-13         tidyr_0.8.2         
## [136] class_7.3-15         rmarkdown_1.11       segmented_0.5-3.0   
## [139] Rtsne_0.15           base64enc_0.1-3
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Cluster Biomarkers

Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).

Shown here: Biomarkers of each cluster vs. all other clusters.

Biomarkers Data

All Biomarkers

# Limit to only top variable genes?: 
# Set arg 'only.pos=F' to capture negative biomarkers
# DAT <- SetIdent(DAT, ident.use = "post_clustering") 

DAT.markers <- FindAllMarkers(object = DAT, min.pct = 0.25, thresh.use = 0.25,  only.pos = F,  
                              test.use = "wilcox") # DESeq2
DAT.markers <- DAT.markers %>% mutate(FC = 2^avg_logFC)

DAT.markers.sig <- DAT.markers %>% subset(p_val_adj<=0.05) 
markers.summary <- DAT.markers.sig %>% group_by(cluster) %>% tally()
# markers.summary <- base::merge(DAT.markers.sig %>% group_by(cluster) %>% tally(),
#       DAT.markers %>% group_by(cluster) %>% summarise(mean(avg_logFC)), 
#       by="cluster" )

createDT(markers.summary, caption = "Number of DEGs and Mean logFC per Cluster")
createDT(DAT.markers, caption = paste("All Biomarkers: All Clusters"))

Top Biomarkers

topNum = 5
topBiomarkers <- DAT.markers %>% group_by(cluster) %>% top_n(topNum, avg_logFC)
createDT(DAT.markers, caption = paste("All Biomarkers: All Clusters"))

Cluster Biomarker: Violin Plots

getTopBiomarker <- function(DAT.markers, clusterID, topN=1){
  df <-DAT.markers %>%
    subset(p_val_adj<0.05 & cluster==as.character(clusterID)) %>%
    arrange(desc(avg_logFC))
    top_pct_markers <- df[1:topN,"gene"]
  return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(DAT.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(DAT.markers, clusterID=2, topN=2)


### Plot biomarkers 
plotBiomarkers <- function(DAT, biomarkers, cluster){
  biomarkerPlots <- list()
  for (marker in biomarkers){ 
    p <- VlnPlot(object = DAT, features.plot = c(marker), y.log=T, return.plotlist=T) 
    biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=0.5) + xlab( "Cluster") + ylab( "Expression")
  }
  combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) ) 

  # biomarkerPlots <- lapply(biomarkers, function(marker) {
  #   VlnPlot(object = DAT, features.plot = c(marker), y.log=T, return.plotlist=T) %>% + ggplot2::ggtitle(marker) %>% ggplotly() 
  # })    
  # return(subplot(biomarkerPlots) )
}   

top1 <- DAT.markers %>% group_by(cluster) %>% top_n(1, avg_logFC) 
nCols <- floor( sqrt(length(unique(top1$cluster))) )   
figHeight <- nCols *7

# Plot top 2 biomarker genes for each 
for (clust in unique(DAT.markers$cluster)){ 
   cat('\n')   
   cat("### Cluster ",clust,"\n") 
   biomarkers <- getTopBiomarker(DAT.markers, clusterID=clust, topN=2)
   plotBiomarkers(DAT, biomarkers, clust)  
   cat('\n')   
} 

Cluster 0

Cluster 1

Cluster 2

Cluster Biomarker: Volcano Plots

##Construct the plot object
volcanoPlot <- function(DEG_df, caption="", topFC_labeled=5){
  DEG_df$sig<-  ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC<1.5, "p_val_adj<0.05",
            ifelse( DEG_df$p_val_adj<0.05  & DEG_df$avg_logFC>1.5, "p_val_adj<0.05 & avg_logFC>1.5",
                "p_val_adj>0.05"
        )) 
  DEG_df <- arrange(DEG_df, desc(sig))
  yMax  <- max(-log10(DEG_df$p_val_adj)) + max(-log10(DEG_df$p_val_adj))/3 #ifelse(max(-log10(DEG_df$p_val_adj))<45, 50, max(-log10(DEG_df$p_val_adj)) + 10)
  
  vol <- ggplot(data=DEG_df, aes(x=avg_logFC, y= -log10(p_val_adj))) +
    geom_point(alpha=0.5, size=3, aes(col=sig)) + 
    scale_color_manual(values=list("p_val_adj<0.05"="turquoise3",
                                   "p_val_adj<0.05 & avg_logFC>1.5"="purple", 
                                   "p_val_adj>0.05" = "darkgray")) +
    theme(legend.position = "none") + 
    xlab(expression(paste("Average ",log^{2},"(fold change)"))) +
    ylab(expression(paste(-log^{10},"(p-value)"))) + xlim(-2,2) + ylim(0, yMax) +
    ## ggrepl labels
    geom_text_repel(data= arrange(DEG_df,  p_val_adj, desc(avg_logFC))[1:topFC_labeled,], 
                    # filter(DEG_df, avg_logFC>=1.5)[1:10,],
                    aes(label=gene),  color="black", alpha=.5,
                    segment.color="black", segment.alpha=.5  
                    ) +  
    # Lines
    geom_vline(xintercept= -1.5,lty=4, lwd=.3, alpha=.5) + 
    geom_vline(xintercept= 1.5,lty=4, lwd=.3, alpha=.5) +
    geom_hline(yintercept= -log10(0.05),lty=4, lwd=.3, alpha=.5) + 
    ggtitle(caption) 
  print(vol)
}


# Run plots
for (clust in unique(DAT.markers$cluster)){
   cat('\n')   
   cat("### Cluster ",clust,": Volcano","\n") 
   cap <- paste("Cluster",clust,"DEG Table") 
   DEG_df <- subset(DAT.markers, cluster==as.character(clust)) %>% arrange(desc(avg_logFC))  
   volcanoPlot(DEG_df, caption = cap)
   createDT_html(DEG_df, caption = cap)
   cat('\n')   
} 

Cluster 0 : Volcano

Cluster 1 : Volcano

Cluster 2 : Volcano

## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

Top Biomarker Plot

Biomarkers GO

for (clust in top1$cluster){ 
  subClust <- subset( top1, cluster==clust)
  cat('\n')
  cat("### Cluster",clust,"\n")
  cat( "Biomarker\n",subClust$gene) 
  results <- Seurat::FindGeneTerms(QueryGene = subClust$gene) 
  print(results) #parse_html_notebook(results)
  cat('\n')
}

Cluster 0

Biomarker S100A12{xml_document} [1] CEBPA_26348894_ChIP-Seq_LIVER_Mouse [1]

TTF2_22483619_ChIP-Seq_HELA_Human …

Cluster 2

Biomarker FCER1A{xml_document} [1]

FOXA1_21572438_ChIP-Seq_LNCaP_Human

Biomarkers Heatmap

top5 <- DAT.markers %>% group_by(cluster) %>% top_n(5, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = DAT, genes.use = top5$gene, slim.col.label=T, remove.key=T) 

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Biomarkers Ridgeplot

RidgePlot(DAT, features.plot = top1$gene,  nCol = nCols, do.sort = F)
## Picking joint bandwidth of 0.299
## Picking joint bandwidth of 0.117
## Picking joint bandwidth of 0.119

Biomarkers Split Dot Plot

Visualize biomarker expression for each cluster, by disease

top2 <- DAT.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)

sdp <- SplitDotPlotGG(DAT, genes.plot = top2$gene, cols.use = c("blue","red"), 
                      x.lab.rot = T, plot.legend = T, dot.scale = 8, do.return = T, grouping.var = "dx")

Map Clusters to Known Biomarkers

  • Known Monocytes Biomarkers
    • Classical: CD14++ / CD16–
    • Intermediate: CD14++ / CD16+
    • Nonclassical: CD14+ / CD16++ (not captured in this data)

The following plots show the absolute expression of each biomarker, as opposed to avg_logFC which is dependent on the expression patterns of other cell types being compared.

Markers Dataframe

markerList <- c("CD14", "FCGR3A") 
 
get_markerDF <- function(DAT, markerList, meta_vars =c("barcode", "dx", "mut","post_clustering", "percent.mito","nGene", "nUMI")){
  exp <- DAT@scale.data %>% data.frame() 
  marker.matrix <- exp[row.names(exp) %in% markerList, ] 
  marker.matrix$Gene <- row.names(marker.matrix)
  markerMelt <- reshape2:::melt.data.frame(marker.matrix, id.vars = "Gene", variable.name = "Cell",value.name = "Expression") 
  metaSelect <-  DAT@meta.data[,meta_vars] 
  markerDF <- merge(markerMelt,metaSelect, by.x="Cell", by.y="barcode") 
  return(markerDF)
}
markerDF <- get_markerDF(DAT, markerList)
createDT(markerDF, caption = "Known Marker Expression")

Marker ANOVAs + Boxplots

# Explore expression differences between groups
marker_vs_metadata <- function(markerDF, meta_var){ 
  # Create title from ANOVA summary
  ANOVAtitle <- function(markerDF, marker){
      nTests <- length(unique(markerDF$Gene))
      res <- anova(lm(data = subset(markerDF, Gene==marker), 
                      formula = Expression ~ eval(parse(text=meta_var))))
      
      title <-paste(paste("ANOVA (",marker, " vs. ",meta_var, ")", sep=""), 
                    ": p=",round(res$`Pr(>F)`,3), 
                    ", F=",round(res$`F value`,3), 
        ifelse(res$`Pr(>F)`<.05/nTests,"(Significant**)",
               "(Non-significant)") ) 
  }
  
  title = ""
  for (marker in unique(markerDF$Gene) ){
    cat(marker)
    title <- paste(title, "\n", ANOVAtitle(markerDF, marker))
  } 
   
 
  ggplot(markerDF, aes(x=eval(parse(text=meta_var)), y=Expression, fill= Gene)) + 
    geom_violin() + 
      geom_point( position=position_jitterdodge(jitter.width = .2, dodge.width = .9 ),   alpha=0.6, color="turquoise3") + 
    labs(title = title, x=meta_var) +
    theme(plot.title = element_text( size=10)) +
    scale_fill_manual(values=c("brown", "slategray"))
  
}

ANOVA: dx

marker_vs_metadata(markerDF, "dx")
## CD14FCGR3A

ANOVA: mut

marker_vs_metadata(markerDF, "mut")
## CD14FCGR3A

Identify Cell Types with Garnett + Monocle

Pre-trained PBMC Classifer

# load("DAT.R")    
# Convert Seurat objt to CDS object
mDAT <- monocle::importCDS(DAT,  import_all = T) 
# generate size factors for normalization later
mDAT <- estimateSizeFactors(mDAT)
# Get pre-trained PBMC classifer 
load("./Garnet/hsPBMC") # Download from: https://cole-trapnell-lab.github.io/garnett/classifiers/hsPBMC 
# Get feature genes for each cell type
feature_genes <- get_feature_genes(classifier = hsPBMC,
                                   node = "root",
                                   db = org.Hs.eg.db, 
                                   convert_ids = T)
head(feature_genes)
mDAT <- classify_cells(mDAT, hsPBMC,
                           db = org.Hs.eg.db,
                           cluster_extend = TRUE,
                           cds_gene_id_type = "SYMBOL")
head(pData(mDAT))
table(pData(mDAT)$cell_type)
table(pData(mDAT)$cluster_ext_type) 



# Run tSNE: Plot Clusters and Cell Types 
mDAT <- reduceDimension(mDAT, max_components = 3, reduction_method = "tSNE") 
commonGeoms <- labs(x="tSNE1",y="tSNE2")

plot_grid(nrow = 2,
  qplot(data = pData(mDAT), mDAT@reducedDimA[1,], mDAT@reducedDimA[2,], color = cell_type) + theme_bw() + commonGeoms,
  qplot(data = pData(mDAT), mDAT@reducedDimA[1,], mDAT@reducedDimA[2,], color = cluster_ext_type) + theme_bw() + commonGeoms
)


# Unsupervised Clustering
mDAT <- clusterCells(mDAT, num_clusters = 5)
pData(mDAT)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster",  markers = c("CD14", "FCGR3A"))
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~dx)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~mut)
plot_cell_clusters(mDAT, 1, 2, color = "Cluster") + facet_wrap(~cell_type) 

DAT <- Seurat::AddMetaData(DAT, pData(mDAT)[c("garnett_cluster","cell_type","cluster_ext_type","Cluster")])

Train Classifier on Data

Pseudo-time

WARNING: Very computationally expensive!

mDAT <- reduceDimension(mDAT, max_components = 2, method = 'DDRTree')
mDAT <- orderCells(mDAT)
plot_cell_trajectory(mDAT, color_by = "dx")
plot_cell_trajectory(mDAT, color_by = "cell_type")
plot_cell_trajectory(HSMM_myo, color_by = "cell_type") +
    facet_wrap(~mut, nrow = 2)

Known Biomarkers: Heatmaps

Cells Separated

markerDF <- get_markerDF(DAT, markerList, 
             meta_vars =c("barcode", "dx", "mut","ID","post_clustering", "percent.mito","nGene", "nUMI" )) #cell_type
Spectral <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(length(unique(DAT@meta.data$mut)), "Spectral"))

# DAT <- DoKMeans(DAT, k.genes = 3) 
# KMeansHeatmap(DAT)

if (T==F){
  # Spectral <- heatmaply::Spectral(length(unique(DAT@meta.data$mut)))
  markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 
  heatmaply::heatmaply(markerMelt,  key.title="Expression",#plot_method= "ggplot",
        k_row = dim(markerMelt)[2], dendrogram = "row",
        showticklabels = c(T, F), xlab = "Known Markers", ylab = "Cells", column_text_angle = 45, 
        row_side_colors =  DAT@meta.data[,c("dx","mut")], #cell_type
        row_side_palette = Spectral
        )  %>%  colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 2)  %>% 
        colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 1)
 }else{ 
  # markerDF_sub <-subset(markerDF, Gene==markerList[1])  
  # var_to_colors(markerDF_sub, "post_clustering")  
  # library(pheatmap)
  # pheatmap(markerMelt, annotation_row = markerDF_sub[c("dx","mut","cell_type")])
  # pheatmap(markerMelt, kmeans_k = NA, annotation_row = markerDF_sub[c("dx","mut","cell_type")],
  #         cluster_cols = F, cutree_rows = length(unique(markerDF$post_clustering)),  angle_col=45 )
  library(RColorBrewer) 
  var_to_colors <- function(markerDF, metaVar){
    colors <- brewer.pal(length(unique(markerDF[metaVar]) ), "Dark2")
     sample(colors, length(unique(markerDF[metaVar])), replace = TRUE, prob = NULL)
    # metaColors <- colors[ subset(markerDF, Gene==markerList[1])[metaVar][,1] %>% as.factor() ]
    return(metaColors)
  }  
  # library(GMD)
  # myCols = cbind(var_to_colors(markerDF, "dx"), var_to_colors(markerDF, "mut")) 
  # rlab=t(cbind(
  #   var_to_colors(markerDF, "post_clustering"),
  #   var_to_colors(markerDF, "dx")
  #   ))
  #   heatmap.2(marker.matrix, key.title="Expression",  col = viridis(300), trace="none",Colv = F, Rowv = F,
  #             labRow = F, xlab = "Biomarker", ylab="Cell", cexCol=1, RowSideColors = var_to_colors(markerDF, "post_clustering")
  #             )
  # heatmap.3(markerMelt, dendrogram = 'row', kr = length(unique(markerDF)), labRow = F, 
  #           xlab = "Biomarker", ylab = "Cell", RowSideColors = rlab, RowSideColorsSize=2 )
   
  
  
  markerDF <- markerDF %>%    
    mutate_at(vars(post_clustering, dx, mut, ID), as.factor) %>% #cell_type
    mutate(Cluster = post_clustering) %>%
    arrange(post_clustering) 
  # markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 
  markerMelt <- dcast(markerDF,  Cell + post_clustering + dx + mut + ID  ~ Gene, #+ cell_type
                      fun.aggregate = mean, value.var = "Expression") %>% arrange(post_clustering)
  marker.matrix <- markerMelt[markerList] %>%as.matrix()
  row.names(marker.matrix) <- markerMelt$Cell
  
  ha = HeatmapAnnotation(df = markerDF[c("dx","mut","ID","post_clustering")], which = "row")  #cell_type
  
  ComplexHeatmap::Heatmap(marker.matrix, col=viridis(300), column_title = "Biomarker", row_title = "Cell",  
                          row_dend_reorder = F,show_row_names = F, show_column_dend = F,show_row_dend =T,
                          cluster_rows = T, column_title_side = "bottom",km = length(unique(markerMelt$post_clustering))) + ha
 

} 

Average Expression: By Clusters

markerDF <- markerDF %>% mutate(Cluster = post_clustering)
# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, Cluster) %>% summarise(meanExp = mean(Expression)) 

p <- ggplot(data = avgMarker, aes(x=Gene, y=Cluster, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Average Expression: By Disease

# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, dx, Cluster) %>% summarise(meanExp = mean(Expression)) 
p <- ggplot(data = avgMarker, aes(x=Gene, y=dx, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p 

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Known Biomarkers: Boxplot

ggplot(data = markerDF, aes(x=Cluster, y=Expression, fill=Gene)) %>% 
  + geom_boxplot(alpha=0.5) %>% + scale_fill_manual(values=c("purple", "turquoise"))  

Known Biomarkers: tSNE

#, results = 'hide', fig.show='hide'
expressionTSNE <- function(DAT, marker, colors=c("grey", "red")){
   FeaturePlot(object = DAT, features.plot = marker, cols.use = colors, 
    reduction.use = "tsne", nCol=2, do.return = T, dark.theme = T)[[1]]
  # p <- ifelse(interactive, p %>% ggplotly() %>% toWebGL(), print(p)) 
}
 plot_grid(expressionTSNE(DAT, markerList[1]),
 expressionTSNE(DAT, markerList[2], colors=c("grey", "green")))

Label Clusters by DGE Biomarkers

current.cluster.ids <- unique(DAT.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- DAT.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")

DAT@ident <- plyr::mapvalues(x = DAT@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object=DAT, do.label=T, pt.size=0.5, do.return=T) 

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p)) 

DGE: All Cells

  • DGE methods available in Seurat include:
  • DESeq2DETest()
  • DiffExpTest()
  • DiffTTest()
# Available DGE methods:
## "wilcox", "bimod", "roc", "t", "tobit", "poisson", "negbinom", "MAST", "DESeq2"
runDGE <- function(DAT, meta_var, group1, group2, test.use="wilcox"){
  #print(paste("DGE_allCells",meta_var,sep="_")) 
  DAT <- SetAllIdent(DAT, id = meta_var)
  DAT <- StashIdent(DAT, save.name = meta_var)  
  DEGs <- FindMarkers(DAT, ident.1=group1, ident.2=group2, test.use=test.use, only.pos = F)
  DEGs$gene <- row.names(DEGs)
  
  cap <- paste("DEGs:\n",group1, "vs.", group2)
  createDT_html(DEG_df, caption = cap)
  volcanoPlot(DEG_df, caption = cap)
  
  DAT <- SetAllIdent(DAT, id = "post_clustering")
  return(DEGs)
}

PD vs. Controls

DEG_df <-runDGE(DAT, "dx", group1 = "PD", group2="control")
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

LRRK vs. PD

DEG_df <-runDGE(DAT, "mut", "LRRK2", "PD")

cell_type

DEG_df <-runDGE(DAT, "cell_type", "Monocytes", "Dendritic cells") 

DGE: Within Clusters

DGE_within_clusters <- function(DAT, meta_var, group1, group2){ 
  for (clust in unique(DAT@meta.data$post_clustering)){
    # Subset cells by cluster  
   cat('\n')   
   cat("### ",paste("Cluster ",clust,": ",group1," vs. ", group2, sep="") , "\n")
   DAT_clustSub <- Seurat::SubsetData(DAT, accept.value = clust, subset.raw = T)  
   DEG_df <-runDGE(DAT_clustSub, meta_var, group1, group2 ) 
   cat('\n')   
  } 
}

Between Disease Groups

DGE_within_clusters(DAT, "dx", "PD", "control")    

Cluster 2: PD vs. control

Cluster 0: PD vs. control

Cluster 1: PD vs. control

Between Mutation Groups

DGE_within_clusters(DAT, "mut", "LRRK2", "PD")

Cluster 2: LRRK2 vs. PD

Cluster 0: LRRK2 vs. PD

Cluster 1: LRRK2 vs. PD

Between cell_type

DGE_within_clusters(DAT, "cell_type", "Monocytes", "Dendritic cells") 
DAT@meta.data$cell_type %>% table()

Try Different Cluster Resolutions

If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.

Find New Clusters

new_resolution <- 3.0
orig_resolution <- paste("resolution",resolution,sep="_")
DAT <- StashIdent(object = DAT, save.name = orig_resolution) 

## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
DAT <- FindClusters(object = DAT, reduction.type = "pca", dims.use = 1:10,
                     resolution = new_resolution, print.output = F)
DAT <- StashIdent(object = DAT, save.name = "resolution_3.0") 

plot1 <- TSNEPlot(object = DAT, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = DAT, do.return = TRUE, group.by = "post_clustering", 
                  no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot_grid(plot1, plot2)

Find New Biomarkers

res3.0_markers <- FindAllMarkers(object = DAT, min.pct = 0.25, thresh.use = 0.25,  only.pos = F,  test.use = "wilcox")
top1_res3.0 <- res3.0_markers %>% group_by(cluster) %>% top_n(1, avg_logFC) 

FeaturePlot(object = DAT, features.plot = top1_res3.0$gene, cols.use = c("green", "blue"))

# Set back to orig
DAT <- SetAllIdent(object = DAT, id = orig_resolution) 

Save Results

save.image(file.path(resultsPath, "scRNAseq_results.RData"))   
htmltools::includeHTML("./scRNAseq_Enrichment.html")
Enrichment

Setup

root = "./"
# Import functions
source(file.path(root, "./general_functions.R"))
import_parameters(params)
## **** __Utilized Cores__ **** = 2$subsetGenes
## [1] "protein_coding"
## 
## $subsetCells
## [1] 500
## 
## $resolution
## [1] 0.6
## 
## $resultsPath
## [1] "./Results"
## 
## $nCores
## [1] 2
## 
## $perplexity
## [1] 30
load(file.path(resultsPath, "scRNAseq_results.RData"))

./Results

Load Libraries

library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr) 
library(plotly)
library(ggplot2)
library(viridis)
library(reshape2)
library(shiny) 
library(ggrepel)
library(DT) 
library(ComplexHeatmap); #BiocManager::install("ComplexHeatmap") 
  
## Install Bioconductor
#  if (!requireNamespace("BiocManager"))
#     install.packages("BiocManager") 
library(biomaRt) # BiocManager::install(c("biomaRt"))
library(DESeq2) # BiocManager::install(c("DESeq2"))
library(enrichR) #BiocManager::install("enrichR")

library(monocle) #BiocManager::install("monocle")
# BiocManager::install("DelayedMatrixStats")
# BiocManager::install("org.Mm.eg.db") 
library(org.Hs.eg.db)
library(garnett) # devtools::install_github("cole-trapnell-lab/garnett") 
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] splines   parallel  stats4    grid      stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] garnett_0.1.1               org.Hs.eg.db_3.7.0         
##  [3] AnnotationDbi_1.44.0        monocle_2.10.1             
##  [5] DDRTree_0.1.5               irlba_2.3.3                
##  [7] VGAM_1.0-6                  enrichR_1.0                
##  [9] DESeq2_1.22.2               SummarizedExperiment_1.12.0
## [11] DelayedArray_0.8.0          BiocParallel_1.16.5        
## [13] matrixStats_0.54.0          Biobase_2.42.0             
## [15] GenomicRanges_1.34.0        GenomeInfoDb_1.18.1        
## [17] IRanges_2.16.0              S4Vectors_0.20.1           
## [19] BiocGenerics_0.28.0         biomaRt_2.38.0             
## [21] ComplexHeatmap_1.20.0       DT_0.5.1                   
## [23] ggrepel_0.8.0               shiny_1.2.0                
## [25] reshape2_1.4.3              viridis_0.5.1              
## [27] viridisLite_0.3.0           plotly_4.8.0               
## [29] knitr_1.21                  gridExtra_2.3              
## [31] dplyr_0.7.8                 Seurat_2.3.4               
## [33] Matrix_1.2-15               cowplot_0.9.4              
## [35] ggplot2_3.1.0              
## 
## loaded via a namespace (and not attached):
##   [1] reticulate_1.10        R.utils_2.7.0          tidyselect_0.2.5      
##   [4] RSQLite_2.1.1          htmlwidgets_1.3        docopt_0.6.1          
##   [7] combinat_0.0-8         trimcluster_0.1-2.1    Rtsne_0.15            
##  [10] munsell_0.5.0          codetools_0.2-16       ica_1.0-2             
##  [13] withr_2.1.2            fastICA_1.2-1          colorspace_1.4-0      
##  [16] rstudioapi_0.9.0       ROCR_1.0-7             robustbase_0.93-3     
##  [19] dtw_1.20-1             gbRd_0.4-11            Rdpack_0.10-1         
##  [22] lars_1.2               slam_0.1-44            GenomeInfoDbData_1.2.0
##  [25] bit64_0.9-7            pheatmap_1.0.12        xfun_0.4              
##  [28] diptest_0.75-7         R6_2.3.0               locfit_1.5-9.1        
##  [31] hdf5r_1.0.1            flexmix_2.3-14         bitops_1.0-6          
##  [34] assertthat_0.2.0       promises_1.0.1         SDMTools_1.1-221      
##  [37] scales_1.0.0           nnet_7.3-12            gtable_0.2.0          
##  [40] npsurv_0.4-0           rlang_0.3.1            genefilter_1.64.0     
##  [43] GlobalOptions_0.1.0    lazyeval_0.2.1         acepack_1.4.1         
##  [46] checkmate_1.9.1        yaml_2.2.0             backports_1.1.3       
##  [49] httpuv_1.4.5.1         Hmisc_4.2-0            tools_3.5.1           
##  [52] gplots_3.0.1.1         RColorBrewer_1.1-2     proxy_0.4-22          
##  [55] ggridges_0.5.1         Rcpp_1.0.0             plyr_1.8.4            
##  [58] base64enc_0.1-3        progress_1.2.0         zlibbioc_1.28.0       
##  [61] purrr_0.3.0            RCurl_1.95-4.11        densityClust_0.3      
##  [64] prettyunits_1.0.2      rpart_4.1-13           pbapply_1.4-0         
##  [67] GetoptLong_0.1.7       zoo_1.8-4              cluster_2.0.7-1       
##  [70] magrittr_1.5           data.table_1.12.0      circlize_0.4.5        
##  [73] lmtest_0.9-36          RANN_2.6.1             mvtnorm_1.0-8         
##  [76] fitdistrplus_1.0-14    hms_0.4.2              lsei_1.2-0            
##  [79] mime_0.6               evaluate_0.12          xtable_1.8-3          
##  [82] XML_3.98-1.17          sparsesvd_0.1-4        mclust_5.4.2          
##  [85] shape_1.4.4            HSMMSingleCell_1.2.0   compiler_3.5.1        
##  [88] tibble_2.0.1           KernSmooth_2.23-15     crayon_1.3.4          
##  [91] R.oo_1.22.0            htmltools_0.3.6        segmented_0.5-3.0     
##  [94] later_0.8.0            Formula_1.2-3          snow_0.4-3            
##  [97] tidyr_0.8.2            geneplotter_1.60.0     DBI_1.0.0             
## [100] MASS_7.3-51.1          fpc_2.1-11.1           R.methodsS3_1.7.1     
## [103] gdata_2.18.0           metap_1.1              bindr_0.1.1           
## [106] igraph_1.2.2           pkgconfig_2.0.2        foreign_0.8-71        
## [109] foreach_1.4.4          annotate_1.60.0        XVector_0.22.0        
## [112] bibtex_0.4.2           stringr_1.4.0          digest_0.6.18         
## [115] tsne_0.1-3             rmarkdown_1.11         htmlTable_1.13.1      
## [118] kernlab_0.9-27         gtools_3.8.1           modeltools_0.2-22     
## [121] rjson_0.2.20           nlme_3.1-137           jsonlite_1.6          
## [124] bindrcpp_0.2.2         limma_3.38.3           pillar_1.3.1          
## [127] lattice_0.20-38        httr_1.4.0             DEoptimR_1.0-8        
## [130] survival_2.43-3        glue_1.3.0             qlcMatrix_0.9.7       
## [133] FNN_1.1.2.2            png_0.1-7              prabclus_2.2-7        
## [136] iterators_1.0.10       bit_1.1-14             class_7.3-15          
## [139] stringi_1.2.4          mixtools_1.1.0         blob_1.1.1            
## [142] doSNOW_1.0.16          latticeExtra_0.6-28    caTools_1.17.1.1      
## [145] memoise_1.1.0          ape_5.2
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Enrichment

enrichr_dbs <- c("KEGG_2018", "Reactome_2016",
                 "GO_Biological_Process_2018", "GO_Molecular_Function_2018", "GO_Cellular_Component_2018", 
                 "Rare_Diseases_AutoRIF_ARCHS4_Predictions", "Human_Gene_Atlas")
createDT(enrichR::listEnrichrDbs(), "Enrichr Databases")

Enrichr on Clusters

for (clust in unique(DAT.markers.sig$cluster)){
  cat('\n')
  cat("### Cluster ",clust,"{.tabset .tabset-fade}\n")
  geneList <- subset(DAT.markers.sig, cluster==clust)$gene  %>% as.character()
  results <- enrichr(genes = geneList, databases = enrichr_dbs )
  for (db in enrichr_dbs){
    cat('\n')
    cat("#### ",db,"\n")  
    createDT_html(subset(results[[db]], Adjusted.P.value<=0.05), paste("Enrichr Results: ",db,"Cluster ", clust))
    cat('\n')
  } 
  cat('\n')
} 

Cluster 0

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Cluster 1

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Cluster 2

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Enrichr on WGCNA Modules

  • Background: Bulk RNA-seq was conducted on monocytes extracted from the blood of controls and PD patients. Katia Lopes conducted Weighted Correlation Network Analysis (WGCNA) on these sameples and identified co-expression modules.
  • Objective: Determine whether any of these modules are representative of cell groups in our scRNA-seq monocytes data.
eigengenes <- read.delim(file.path(root,"Data/bulkMonocytes_WGCNAmodules_geneMembership.txt"), row.names = NULL)
modules <- read.delim(file.path(root,"Data/bulkMonocytes_WGCNAmodules_geneModules.txt"), row.names = NULL, sep = "", 
                      col.names = c("Ensembl","moduleColors")) 
modules <- base::merge(eigengenes, modules,by="Ensembl" )

for (mod in unique(modules$moduleColors)){
  cat('\n')
  cat("### Module ",mod,"{.tabset .tabset-fade}\n")
  geneList <- subset(modules, moduleColors==mod)$symbol %>% as.character()
  results <- enrichr(genes = geneList, databases = enrichr_dbs )
  for (db in enrichr_dbs){
    cat('\n')
    cat("#### ",db,"\n")  
    createDT_html(subset(results[[db]], Adjusted.P.value<=0.05), paste("Enrichr Results:",db,"Module", mod))
    cat('\n')
  } 
  cat('\n')
}

Module blue

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Module red

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Module turquoise

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Human_Gene_Atlas

Module green

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Module pink

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

Module black

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas

WGCNA Eigengenes

Determine whether each of the clusters in scRNA-seq data are enriched for WGCNA eigengenes (a weighted vector of all genes representing each co-expression module).

https://ucdavis-bioinformatics-training.github.io/2017_2018-single-cell-RNA-sequencing-Workshop-UCD_UCB_UCSF/day3/scRNA_Workshop-PART6.html

#Get the average expression of every gene in each cluster
allGenes <- get_markerDF(DAT, markerList = row.names(DAT@scale.data), meta_vars = c("post_clustering", "barcode") )

clusterGeneAvg <- allGenes %>% group_by(post_clustering, Gene) %>% summarise(meanExp = mean(Expression))
eigengenes_filt <-  subset(eigengenes,symbol %in%  unique(clusterGeneAvg$Gene))


clusts_by_mods <- base::merge(clusterGeneAvg[c("Gene","meanExp")], eigengenes_filt[c("symbol", modName)], 
                              by.x="Gene", by.y="symbol")


cor.test()
corrplot()
heatmap.2
 

f <- function(module){
  eigengene <-  eigengenes[paste0("MM", mod)]
  means <- tapply(eigengenes, DAT@meta.data$post_clustering, mean, na.rm = T)
  return(means)
}
modules <- c("blue", "brown", "green", "turquoise", "yellow")
plotdat <- sapply(modules, f)
matplot(plotdat, col = modules, type = "l", lwd = 2, xaxt = "n", xlab = "Seurat Cluster",
        ylab = "WGCNA Module Eigengene")
axis(1, at = 1:16, labels = 0:15)
matpoints(plotdat, col = modules, pch = 21)

RRHO

library(RRHO) #BiocManager::install("RRHO")

# list.length <- 100
#  list.names <- paste('Gene',1:list.length, sep='')
# gene.list1<- data.frame(list.names, sample(100))
# gene.list2<- data.frame(list.names, sample(100))


for (clust in unique(DAT.markers.sig)){
  # Compare each cluster
  subClust <- subset(DAT.markers.sig, cluster==clust)  %>% arrange(desc(avg_logFC))
  
  for (mod in unique(modules$moduleColors)){ 
    # Sort genes by module membership
    modName <-paste("MM",mod,sep="")
    subMod <- subset(modules, moduleColors==mod) %>% arrange(desc(eval(parse(text = modName))))
    maxGenes <- min(length(subClust$gene), subMod$symbol) %>% as.numeric()
    
    list1 <- subClust[1:maxGenes, c("gene","FC")] %>% dplyr::rename(value=FC)
    list2 <- subMod[1:maxGenes, c("symbol",modName)] %>% dplyr::rename(gene=symbol, value=modName)
    
    RRHO_path <-file.path("RRHO_results",paste(paste("Cluster",clust,sep=""),"vs",modName,sep="_"))
    dir.create(RRHO_path,recursive = T, showWarnings = F)
    
    RRHO_results <- RRHO(list1=list1, list2=list2,
         labels = c(paste("Cluster",clust,sep="_"), paste("Module",mod,sep="_")), 
         plots = T, alternative = "enrichment", outputdir = RRHO_path, BY=TRUE
         )
    lattice::levelplot(RRHO_results$hypermat) 
    # Pval testing
    pval.testing <- pvalRRHO(RRHO_results, 50)
    pval.testing$pval
    xs<- seq(0, 10, length=100)
    plot(Vectorize(pval.testing$FUN.ecdf)(xs)~xs, xlab='-log(pvalue)', ylab='ECDF', type='S')
    lattice::levelplot(RRHO_results$hypermat.by)
  } 
} 

Save Results

save.image(file.path(resultsPath, "scRNAseq_results.RData"))